#!/bin/bash
#==========  LSF options  ==========
#BSUB -q fat_768
#BSUB -n 80
#BSUB -R "span[hosts=1]"
#BSUB -J ppo_cpu_chatglm3
#BSUB -o /share/home/zhangshanqi/QSM/cpu/trl/logs/ppo_cpu_chatglm3.%J.out
#BSUB -e /share/home/zhangshanqi/QSM/cpu/trl/logs/ppo_cpu_chatglm3.%J.err

#==========  Runtime environment  ==========
# Enter the TRL checkout. The training script is later referenced by a path
# relative to this directory, so a failed cd must abort the job rather than
# let it continue from whatever directory the job happened to start in.
cd /share/home/zhangshanqi/QSM/cpu/trl || {
  printf 'error: cannot cd to %s\n' /share/home/zhangshanqi/QSM/cpu/trl >&2
  exit 1
}

# Load conda's shell integration so that `conda activate` is available in this
# non-interactive shell; without it the activation below cannot work.
source /share/home/zhangshanqi/QSM/RLHF/miniconda3/etc/profile.d/conda.sh || {
  printf 'error: cannot source conda.sh\n' >&2
  exit 1
}

# Activate the "qsm" environment; stop if it is missing so that the job never
# runs with an unintended Python installation.
conda activate qsm || {
  printf 'error: cannot activate conda environment "qsm"\n' >&2
  exit 1
}

#==========  Threads and affinity  ==========
# Give all four *_NUM_THREADS variables the same value from a single place.
thread_count=80
for thread_var in \
  OMP_NUM_THREADS \
  MKL_NUM_THREADS \
  NUMEXPR_NUM_THREADS \
  OPENBLAS_NUM_THREADS; do
  export "${thread_var}=${thread_count}"
done
# The helpers are only needed for the loop above; drop them again.
unset -v thread_var thread_count

# The affinity setting is derived from OMP_NUM_THREADS so the two stay in step.
export MP_TASK_AFFINITY="core:${OMP_NUM_THREADS}"

#==========  Disable HuggingFace external-network probing  ==========
# Both switches are exported together: offline mode for the Hub client and
# telemetry disabled.
export HF_HUB_OFFLINE=1 HF_HUB_DISABLE_TELEMETRY=1

#==========  Launch PPO training  ==========
# Every model and output path below lives under /scratch/$USER. Abort if USER
# is unset or empty instead of silently resolving the paths to /scratch//...
: "${USER:?USER must be set to locate /scratch/<user>}"
scratch_dir="/scratch/${USER}"

# The arguments are collected in an array and expanded quoted, so each path is
# passed as exactly one word regardless of the characters it contains.
# --model_name_or_path and --sft_model_path deliberately name the same
# checkpoint directory, as in the original command line.
launch_args=(
  --config_file /share/home/zhangshanqi/QSM/cpu/trl/default_config.yaml
  examples/scripts/ppo/ppo.py
  --dataset_name csv
  --dataset_train_split train
  --learning_rate 1e-6
  --num_ppo_epochs 3
  --num_mini_batches 1
  --output_dir "${scratch_dir}/融合ppo_outputzhengshibanben02"
  --per_device_train_batch_size 1
  --gradient_accumulation_steps 8
  --total_episodes 3000
  --model_name_or_path "${scratch_dir}/专家ppo002002"
  --sft_model_path "${scratch_dir}/专家ppo002002"
  --reward_model_path "${scratch_dir}/00104reward_model_pairwise_fp16"
  --missing_eos_penalty 1.0
)

accelerate launch "${launch_args[@]}"

